Attempt to determine the top skills for a data scientist based on job listings for data scientist. Different text mining techniques are used to identify skills from the description section of the job listings.
# Read in the CSV file of the 10000 job listings for Data Scientist.
# stringsAsFactors = FALSE keeps the text columns as character vectors.
jobs_df <- read.csv(
  file = "data_scientist_united_states_job_postings_jobspikr.csv",
  stringsAsFactors = FALSE
)
# For testing purposes, keep only the first 100 rows. head() is used rather
# than jobs_df[1:100, ] because indexing past the last row pads the data frame
# with all-NA rows when the file holds fewer than 100 listings.
jobs_df <- head(jobs_df, 100)
# Output all the column names to confirm
names(jobs_df)
## [1] "crawl_timestamp" "url" "job_title"
## [4] "category" "company_name" "city"
## [7] "state" "country" "inferred_city"
## [10] "inferred_state" "inferred_country" "post_date"
## [13] "job_description" "job_type" "salary_offered"
## [16] "job_board" "geo" "cursor"
## [19] "contact_email" "contact_phone_number" "uniq_id"
## [22] "html_job_description"
# Just confirming the class is character for all these fields
#class(jobs_df$job_title)
#class(jobs_df$category)
#class(jobs_df$job_description)
#class(jobs_df$job_type)
#class(jobs_df$job_board)
Title is not that valuable, as many of the titles are “Data Scientist”, with some containing Sr. or I, or some derivative name, so it will be ignored for the analysis.
# Following the NASA case study: https://www.tidytextmining.com/nasa.html
# Set up separate tidy data frames for job_title, category, job_description, job_type while keeping the uniq_id for each so we can connect later
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Pair each posting's unique id with its job title so the titles can be
# linked back to the other per-posting tables later.
jobs_title <- with(jobs_df, tibble(id = uniq_id, title = job_title))
print(jobs_title, n = 10)
## # A tibble: 100 x 2
## id title
## <chr> <chr>
## 1 3b6c6acfcba6135a31c83b… Enterprise Data Scientist I
## 2 741727428839ae7ada852e… Data Scientist
## 3 cdc9ef9a1de327ccdc19cc… Data Scientist
## 4 1c8541cd2c2c924f9391c7… Data Scientist, Aladdin Wealth Tech, Associate …
## 5 445652a560a54410608578… Senior Data Scientist
## 6 9571ec617ba209fd9a4f84… CIB – Fixed Income Research – Machine Learning …
## 7 0ec629c03f3e82651711f2… Data Scientist, Licensing Operations
## 8 972e897473d65f34b8e7f1… Sr. Data Scientist (Can work on Xoriant W2)
## 9 80d64b46bc7c89602f63da… Data Scientist, Aladdin Wealth Tech, Associate
## 10 b772c6ef8ee7631895ab9a… Data Scientist
## # … with 90 more rows
Job description is the most important field in the dataset. Contains the complete write-up posted for the job listing.
# One row per posting: the unique id and the full description text.
jobs_desc <- with(jobs_df, tibble(id = uniq_id, desc = job_description))
# Peek at five randomly chosen descriptions.
sample_n(select(jobs_desc, desc), 5)
## # A tibble: 5 x 1
## desc
## <chr>
## 1 This position requires corporate experience and either US Green Card or …
## 2 My client is a leader in the Manufacturing vertical and has operations i…
## 3 "Read what people are saying about working here. \n\nCompany: FedEx Serv…
## 4 "Read what people are saying about working here. \n\nRESPONSIBILITIES\n\…
## 5 Our client is currently seeking a Data Scientist - C2H Main Criteria: Pr…
Category is interesting info. Used it for the NASA case study, not sure if the results are really telling of anything, but leaving in for now.
# Pair each posting's unique id with its category, then keep only the
# postings whose category is not blank.
jobs_cat <- tibble(id = jobs_df[["uniq_id"]], category = jobs_df[["category"]])
jobs_cat <- filter(jobs_cat, category != "")
print(jobs_cat, n = 100)
## # A tibble: 57 x 2
## id category
## <chr> <chr>
## 1 3b6c6acfcba6135a31c83bd7ea493b18 Accounting/Finance
## 2 1c8541cd2c2c924f9391c7d3f526f64e Accounting/Finance
## 3 445652a560a5441060857853cf267470 biotech
## 4 9571ec617ba209fd9a4f842973a4e9c8 Accounting/Finance
## 5 0ec629c03f3e82651711f2626c23cadb Accounting/Finance
## 6 80d64b46bc7c89602f63daf06b9f1b4c Accounting/Finance
## 7 9cd3ed78e5cac9e516ea41173de2c25f Computer/Internet
## 8 53224da901548e7137bbb163d456ba6a Computer/Internet
## 9 010b5d671896d26eba50948c7c337f94 Computer/Internet
## 10 4a8b875d1acc4f560716561d699aa022 Computer/Internet
## 11 edcdf4dd38cca4bbccd5ba0b787d2c49 Computer/Internet
## 12 303c2bd509a4d2c2da619f68fe4b28f0 Arts/Entertainment/Publishing
## 13 2956d0023b0c4617c430adcedb1e38d9 Computer/Internet
## 14 e457f78180e72dd1ddee0efc042b0496 Computer/Internet
## 15 8d220fbda8e4ab36f10bba4ec8ff20ad military
## 16 b6e361550638f071a4985bf5f3d440ce business and financial operations
## 17 d33577ea9ae09c58d77e1fab2c012ba2 business and financial operations
## 18 09378ddde9b997b1acbf519c2b9ddf03 business and financial operations
## 19 eb3e1cdd65a86ef8b4cf28094f0c785b business and financial operations
## 20 1b148bd2e730d37c7afd2cbf0e7e7824 business and financial operations
## 21 6354c9293fd5867c41b9f0ca305cd163 business and financial operations
## 22 acfd50bf4d44eb476ec69b38348355be business and financial operations
## 23 674c331993a36bb28fd4cf302ce66e9d business and financial operations
## 24 87c7330ffd4a65d432eee4ca5955b971 Engineering/Architecture
## 25 2775ed23918410617dbc6cad9886e455 Engineering/Architecture
## 26 cbf0b6e8a77c62543a82b26fb06b34da Manufacturing/Mechanical
## 27 1f57eb4c560bbf2bbc780ec6274acaeb Engineering/Architecture
## 28 05116b62a369b96b2fa5d48c87e9310f Engineering/Architecture
## 29 6d5c369d0649aef334336592d5d502e0 Manufacturing/Mechanical
## 30 9c43f943b0210574243390fd774df9f3 life physical and social science
## 31 ea67f8f42a6fa593ce7acefddd8aef2e Computer/Internet
## 32 6393773aacaec4432bd091d3fda500b0 Computer/Internet
## 33 b0f7650425cbbbf4a8161af888c46359 Computer/Internet
## 34 7297f4499b50a131d8878f1bc99bfb0e Computer/Internet
## 35 d6c3c25cd9c3578291bceb0dfc3a5c68 Computer/Internet
## 36 e0f3ff447e5b10caf54a33e503d839e7 Arts/Entertainment/Publishing
## 37 de26a41473d841eb4f4e6c89a9c0c1fb Banking/Loans
## 38 33da39965c50a7044ab1da5586fc8454 agriculture and fishing
## 39 bfb0fdec92e9d40b47a26cec6b1fe457 business and financial operations
## 40 c8909fd5d13d781e4ad9d8708a877199 business and financial operations
## 41 78e276415437c529dd0a82c8d1b7df25 business and financial operations
## 42 f58d6adb2373f5349ef055e7855deafe business and financial operations
## 43 eab636ef77254c7aa93df150cfa55a19 business and financial operations
## 44 a7c13d21920e860ddaefbb3085eece14 business and financial operations
## 45 daab34b7684bdd0a27c3ccb991005d53 Education/Training
## 46 41083b020b17f0394c96ebc271095082 Engineering/Architecture
## 47 33a223e13f1902bc094a6be5e9aff6b6 Engineering/Architecture
## 48 6644ceb685aa5f434a907000d5cfbc8e Engineering/Architecture
## 49 c1bdfdd34b1b67bd435f83d87004fbe5 Engineering/Architecture
## 50 37c4c80252fb381734614164b6bf2c9b Engineering/Architecture
## 51 248276268f2f4779ae399f54962b9a84 Engineering/Architecture
## 52 30f9bad2109e8d12c0e42c90622fc2cb Engineering/Architecture
## 53 689e911a7b842c0942901f00924bd07c Manufacturing/Mechanical
## 54 8ec37358cbf6691fd30c21a5f1de1d8a science
## 55 e682d47f650976c7cfd182e10656003e Engineering/Architecture
## 56 d86cb7aad36562ab4a58c67fd37a4e95 Engineering/Architecture
## 57 99045ef17e641c9631cc2d5939e3fa0c Engineering/Architecture
Type info not valuable, as it’s just full-time, contract, etc.
# Pair each posting's unique id with its job type (e.g. "Full Time",
# "Contract", "Undefined"). Note that the type is stored in a column that is
# also named `category`.
jobs_type <- with(jobs_df, tibble(id = uniq_id, category = job_type))
print(jobs_type, n = 10)
## # A tibble: 100 x 2
## id category
## <chr> <chr>
## 1 3b6c6acfcba6135a31c83bd7ea493b18 Undefined
## 2 741727428839ae7ada852eebef29b0fe Undefined
## 3 cdc9ef9a1de327ccdc19cc0d07dbbb37 Full Time
## 4 1c8541cd2c2c924f9391c7d3f526f64e Undefined
## 5 445652a560a5441060857853cf267470 Full Time
## 6 9571ec617ba209fd9a4f842973a4e9c8 Undefined
## 7 0ec629c03f3e82651711f2626c23cadb Undefined
## 8 972e897473d65f34b8e7f1c1b4c74b1c Contract
## 9 80d64b46bc7c89602f63daf06b9f1b4c Undefined
## 10 b772c6ef8ee7631895ab9a59b5e8b2c1 Contract
## # … with 90 more rows
Attempted to make a list of keywords, which would essentially be the list of skills. The idea was to run this list against the job descriptions and see whether the two could be counted or correlated. But since the list has no ID tying it to the dataset, it has not proven useful. Leaving in for now.
# Hand-picked candidate skill keywords, wrapped in a one-column tibble.
keywords <- c(
  "python", "sql", "modeling", "statistics", "algorithms", "r",
  "visualization", "hadoop", "mining", "communication", "aws", "spark",
  "artificial intelligence", "machine learning", "sas", "cloud",
  "innovative", "driven", "optimization", "java", "databases", "leadership",
  "security", "tableau", "phd", "education", "degree", "hive", "ml", "scala",
  "ms", "economics", "neural", "verbal", "transformation", "culture",
  "tensorflow", "automation", "azure", "nlp", "architecture", "nosql",
  "scripting", "passionate", "agile", "bachelor\'s", "clustering", "pandas",
  "bs"
)
jobs_kw <- tibble(keyword = keywords)
print(jobs_kw)
## # A tibble: 49 x 1
## keyword
## <chr>
## 1 python
## 2 sql
## 3 modeling
## 4 statistics
## 5 algorithms
## 6 r
## 7 visualization
## 8 hadoop
## 9 mining
## 10 communication
## # … with 39 more rows
From the job_description, tokenize all the words and remove “stop_words” which are common words in the English language to allow for focus on meaningful words of the job listing.
# Tokenize the description field with tidytext's unnest_tokens(), giving a
# tidy data frame with one row per (posting id, word), then drop the common
# English words listed in stop_words.
library(tidytext)
jobs_desc <- anti_join(unnest_tokens(jobs_desc, word, desc), stop_words)
## Joining, by = "word"
jobs_desc
## # A tibble: 23,310 x 2
## id word
## <chr> <chr>
## 1 3b6c6acfcba6135a31c83bd7ea493b18 read
## 2 3b6c6acfcba6135a31c83bd7ea493b18 people
## 3 3b6c6acfcba6135a31c83bd7ea493b18 farmers
## 4 3b6c6acfcba6135a31c83bd7ea493b18 join
## 5 3b6c6acfcba6135a31c83bd7ea493b18 team
## 6 3b6c6acfcba6135a31c83bd7ea493b18 diverse
## 7 3b6c6acfcba6135a31c83bd7ea493b18 professionals
## 8 3b6c6acfcba6135a31c83bd7ea493b18 farmers
## 9 3b6c6acfcba6135a31c83bd7ea493b18 acquire
## 10 3b6c6acfcba6135a31c83bd7ea493b18 skills
## # … with 23,300 more rows
Provide count in table form of the most common words in the job descriptions.
# Ten most frequent words across all descriptions
print(count(jobs_desc, word, sort = TRUE), n = 10)
## # A tibble: 3,824 x 2
## word n
## <chr> <int>
## 1 data 971
## 2 experience 455
## 3 business 200
## 4 learning 198
## 5 skills 183
## 6 team 178
## 7 science 177
## 8 models 163
## 9 machine 155
## 10 analysis 152
## # … with 3,814 more rows
Added more words to be removed. The list is certainly not exhaustive at this point, but leaving it in so it can be added to.
# Extra stop words that add noise to the descriptions: the numbers 1-10,
# a few boilerplate terms and two years.
# NOTE: the literal "2" repeats an entry already produced by
# as.character(1:10); it is harmless for the anti-join below.
extra_stopwords <- tibble(
  word = c(
    as.character(1:10),
    "2", "job", "company", "e.g", "religion", "origin", "color", "gender",
    "2019", "1999"
  )
)
jobs_desc <- anti_join(jobs_desc, extra_stopwords)
## Joining, by = "word"
# Re-check the ten most frequent words after the extra removal
print(count(jobs_desc, word, sort = TRUE), n = 10)
## # A tibble: 3,806 x 2
## word n
## <chr> <int>
## 1 data 971
## 2 experience 455
## 3 business 200
## 4 learning 198
## 5 skills 183
## 6 team 178
## 7 science 177
## 8 models 163
## 9 machine 155
## 10 analysis 152
## # … with 3,796 more rows
Applied the stemming of words to in essence combine words of the same root, but I’m not sure if it’s valuable at this point. Leaving in for now.
library(SnowballC)
# Reduce each word to its stem and list the ten most frequent stems.
# from https://abndistro.com/post/2019/02/10/tidy-text-mining-in-r/#stemming
jobs_desc %>%
  count(word_stem = SnowballC::wordStem(word), sort = TRUE) %>%
  print(n = 10)
## # A tibble: 2,676 x 2
## word_stem n
## <chr> <int>
## 1 data 971
## 2 experi 479
## 3 model 309
## 4 team 255
## 5 learn 233
## 6 develop 224
## 7 busi 212
## 8 skill 202
## 9 analyt 201
## 10 statist 192
## # … with 2,666 more rows
# Not very helpful as of yet, definitely stemming words, but then it's too generic
Just a list of skills based on the output of the count of common words in the job description after removing the stop_words
Skills: python, sql, modeling, statistics, algorithms, r, visualization, hadoop, mining, communication, aws, spark, artificial intelligence, machine learning, sas, cloud, innovative, driven, optimization, java, databases, leadership, security, tableau, phd, education, degree, hive, ml, scala, ms, economics, neural, verbal, transformation, culture, tensorflow, automation, azure, nlp, architecture, nosql, scripting, passionate, agile, bachelor’s, clustering, pandas, bs
Applying lowercase to all the words to ensure different cases aren’t problematic
# Force every word to lower case so case variants are not counted separately
jobs_desc <- mutate(jobs_desc, word = tolower(word))
Count the number of times two words appear together in the description field. This does not mean the two words are next to each other, just that they appear in the same description field.
# Word co-ocurrences and correlations
# Count how many times each pair of words occurs together in the description field.
library(widyr)
# For every pair of words, count the postings (ids) in which both occur.
# upper = FALSE is passed so each pair is listed once, not in both orders.
desc_word_pairs <- pairwise_count(
  jobs_desc,
  word, id,
  sort = TRUE, upper = FALSE
)
print(desc_word_pairs)
## # A tibble: 944,637 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 data experience 91
## 2 data python 81
## 3 experience python 80
## 4 skills data 74
## 5 data learning 73
## 6 team data 72
## 7 skills experience 72
## 8 experience learning 72
## 9 data analysis 71
## 10 data scientist 70
## # … with 944,627 more rows
Plot the network of the co-occurring words.
# Plot networks of these co-occuring words
library(ggplot2)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
# Draw the co-occurrence network for word pairs that appear together in at
# least 50 descriptions. The seed is fixed first, presumably so the "fr"
# layout comes out the same on every run.
set.seed(1234)
ggraph(
  graph_from_data_frame(filter(desc_word_pairs, n >= 50)),
  layout = "fr"
) +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "cyan4"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()
Output the correlation of the word pairs.
# Pairwise correlation between words in the description field: which words
# are more likely to occur together than with other words. Only words that
# appear at least 50 times overall are kept before correlating.
desc_cors <- pairwise_cor(
  filter(group_by(jobs_desc, word), n() >= 50),
  word, id,
  sort = TRUE, upper = FALSE
)
print(desc_cors, n = 10)
## # A tibble: 1,953 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 machine learning 0.905
## 2 science computer 0.570
## 3 advanced techniques 0.522
## 4 computer statistics 0.493
## 5 teams role 0.481
## 6 science statistics 0.481
## 7 develop responsibilities 0.477
## 8 preferred systems 0.472
## 9 analytics management 0.471
## 10 degree related 0.465
## # … with 1,943 more rows
# Skipping the rest of section 8.2
Calculating the term frequency times inverse document frequency.
# tf-idf (term frequency times inverse document frequency) for the description
# fields, used to find words that are especially important to one posting
# within the whole collection of postings.
desc_tf_idf <- bind_tf_idf(
  ungroup(count(jobs_desc, id, word, sort = TRUE)),
  word, id, n
)
# Words used at least 10 times in a posting, highest tf-idf first
arrange(filter(desc_tf_idf, n >= 10), desc(tf_idf))
## # A tibble: 73 x 6
## id word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 0ec629c03f3e82651711f2626c23cadb licensing 14 0.0606 4.61 0.279
## 2 37c4c80252fb381734614164b6bf2c9b fedex 12 0.0381 4.61 0.175
## 3 6393773aacaec4432bd091d3fda500b0 dmi 11 0.0282 4.61 0.130
## 4 674c331993a36bb28fd4cf302ce66e9d clinical 17 0.0344 2.66 0.0915
## 5 a7c13d21920e860ddaefbb3085eece14 clinical 17 0.0344 2.66 0.0915
## 6 40ca3fab8f31010d1b5f35877508c0bf cloud 10 0.0325 2.12 0.0688
## 7 972e897473d65f34b8e7f1c1b4c74b1c cloud 10 0.0319 2.12 0.0677
## 8 674c331993a36bb28fd4cf302ce66e9d review 11 0.0223 2.66 0.0592
## 9 a7c13d21920e860ddaefbb3085eece14 review 11 0.0223 2.66 0.0592
## 10 85901569bb70a9ccb272208c19ef043a research 11 0.0447 1.17 0.0524
## # … with 63 more rows
# These are the most important words in the description fields as measured by tf-idf, meaning they are common but not too common.
Combining the data frame of the TF_IDF from the job descriptions with the categories. Joining by the unique ID as key.
By the categories, this will identify the most important words from the job descriptions.
# 8.3.2
# Attach each posting's category to its tf-idf rows (joined on the unique id)
# and plot the 15 highest tf-idf words per category. Not sure how much value
# the categories add, but worth trying.
desc_tf_idf <- full_join(desc_tf_idf, jobs_cat, by = "id")
desc_tf_idf %>%
  # drop rows where one word accounts for the whole term frequency
  filter(!near(tf, 1)) %>%
  # %in% is an exact string match, so every entry must equal the data value
  # character for character. "agriculture and fishing" previously carried a
  # trailing space, which silently removed that category from the plot.
  filter(category %in% c("Accounting/Finance", "biotech",
                         "Computer/Internet", "Arts/Entertainment/Publishing",
                         "military", "business and financial operations",
                         "Engineering/Architecture", "Manufacturing/Mechanical",
                         "life physical and social science", "Banking/Loans",
                         "agriculture and fishing", "Education/Training",
                         "science")) %>%
  arrange(desc(tf_idf)) %>%
  group_by(category) %>%
  # keep one row per word within a category (the highest tf-idf, given the sort)
  distinct(word, category, .keep_all = TRUE) %>%
  top_n(15, tf_idf) %>%
  ungroup() %>%
  # fix the factor order so bars are drawn in descending tf-idf order
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  ggplot(aes(word, tf_idf, fill = category)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~category, ncol = 3, scales = "free") +
  coord_flip() +
  labs(title = "Highest tf-idf words in DS job listing description fields",
       caption = "From jobpickr dataset",
       x = NULL, y = "tf-idf")
For topic modeling, create a document term matrix.
Initially, the word count by document ID.
# 8.4 Topic Modeling
# First step towards a document-term matrix: how often each word occurs in
# each posting, largest counts first.
word_counts <- ungroup(count(jobs_desc, id, word, sort = TRUE))
print(word_counts, n = 10)
## # A tibble: 16,114 x 3
## id word n
## <chr> <chr> <int>
## 1 674c331993a36bb28fd4cf302ce66e9d data 41
## 2 a7c13d21920e860ddaefbb3085eece14 data 41
## 3 625160c64fec3702a8c0c7ae97e77825 data 27
## 4 c10862d80f4bcf1b0b72bfb7f861776f data 27
## 5 99045ef17e641c9631cc2d5939e3fa0c data 25
## 6 de26a41473d841eb4f4e6c89a9c0c1fb data 25
## 7 eda91b88eb3096ed98bc1a5f6b5568df data 25
## 8 40ca3fab8f31010d1b5f35877508c0bf data 24
## 9 cbf0b6e8a77c62543a82b26fb06b34da data 23
## 10 972e897473d65f34b8e7f1c1b4c74b1c data 22
## # … with 1.61e+04 more rows
Now construct the document-term matrix. High level of sparsity. From the case study: “Each non-zero entry corresponds to a certain word appearing in a certain document.”
# Cast the per-posting word counts into a document-term matrix
# (documents = posting ids, terms = words, values = counts).
desc_dtm <- cast_dtm(word_counts, id, word, n)
print(desc_dtm)
## <<DocumentTermMatrix (documents: 100, terms: 3806)>>
## Non-/sparse entries: 16114/364486
## Sparsity : 96%
## Maximal term length: 23
## Weighting : term frequency (tf)
From wikipedia: In natural language processing, the latent Dirichlet allocation (LDA) is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar.
# 8.4.2 Topic modeling
library(topicmodels)
# Fitting this model is time intensive.
# k = 16 topics: 13 categories were entered above, and 16 (2^4) was picked as
# a nearby round number. The seed is passed through `control`.
desc_lda <- LDA(x = desc_dtm, k = 16, control = list(seed = 1234))
print(desc_lda)
## A LDA_VEM topic model with 16 topics.
Tidy the resulting topics.
# Interpreting the model: tidy it into one row per (topic, term) with the
# per-topic word probability beta ("beta" is tidy()'s default matrix).
tidy_lda <- tidy(desc_lda, matrix = "beta")
print(tidy_lda)
## # A tibble: 60,896 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 data 0.0303
## 2 2 data 0.0286
## 3 3 data 0.0189
## 4 4 data 0.0349
## 5 5 data 0.0363
## 6 6 data 0.0571
## 7 7 data 0.0382
## 8 8 data 0.0660
## 9 9 data 0.0620
## 10 10 data 0.0260
## # … with 60,886 more rows
Identify the top 10 terms for each topic.
# Ten highest-beta terms per topic, ordered by topic and then by falling beta.
# top_n() keeps ties, so a topic can contribute more than 10 rows.
top_terms <- tidy_lda %>%
  group_by(topic) %>%
  top_n(n = 10, wt = beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))
print(top_terms)
## # A tibble: 166 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 data 0.0303
## 2 1 experience 0.0107
## 3 1 team 0.0101
## 4 1 business 0.00976
## 5 1 learning 0.00946
## 6 1 models 0.00931
## 7 1 skills 0.00908
## 8 1 science 0.00866
## 9 1 quantitative 0.00811
## 10 1 engineering 0.00737
## # … with 156 more rows
Plot the top 10 terms for each topic. Interesting to see how the words break out, even though the topics are anonymous, only identified by number.
From case study: The topic modeling process has identified groupings of terms that we can understand as human readers of these description fields.
# Bar chart of the top terms, one facet per topic. reorder_within() and
# scale_x_reordered() are used together so terms are ordered by beta
# separately inside every facet.
top_terms %>%
  mutate(term = reorder_within(term, by = beta, within = topic)) %>%
  group_by(topic, term) %>%
  arrange(desc(beta)) %>%
  ungroup() %>%
  ggplot(aes(x = term, y = beta, fill = as.factor(topic))) +
  facet_wrap(~ topic, ncol = 4, scales = "free") +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  coord_flip() +
  labs(
    title = "Top 10 terms in each LDA topic",
    x = NULL,
    y = expression(beta)
  )
Now calculate gamma, which will define the probability that each document belongs in each topic
Below graph isn’t useful to me.
# LDA gamma: one row per (document, topic) with the probability that the
# document belongs to that topic.
lda_gamma <- desc_lda %>%
  tidy(matrix = "gamma")
print(lda_gamma)
## # A tibble: 1,600 x 3
## document topic gamma
## <chr> <int> <dbl>
## 1 674c331993a36bb28fd4cf302ce66e9d 1 0.0000279
## 2 a7c13d21920e860ddaefbb3085eece14 1 0.0000279
## 3 625160c64fec3702a8c0c7ae97e77825 1 0.0000496
## 4 c10862d80f4bcf1b0b72bfb7f861776f 1 0.0000327
## 5 99045ef17e641c9631cc2d5939e3fa0c 1 0.0000422
## 6 de26a41473d841eb4f4e6c89a9c0c1fb 1 0.0000289
## 7 eda91b88eb3096ed98bc1a5f6b5568df 1 0.0000226
## 8 40ca3fab8f31010d1b5f35877508c0bf 1 0.0000448
## 9 cbf0b6e8a77c62543a82b26fb06b34da 1 0.0000327
## 10 972e897473d65f34b8e7f1c1b4c74b1c 1 0.0000441
## # … with 1,590 more rows
# Histogram of gamma over all topics, with a log10 count axis
ggplot(lda_gamma) +
  geom_histogram(aes(x = gamma)) +
  labs(
    title = "Distribution of probabilities for all topics",
    x = expression(gamma),
    y = "Number of documents"
  ) +
  scale_y_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 4 rows containing missing values (geom_bar).
Below graph isn’t useful to me.
# Histogram of gamma drawn separately for each topic (one facet per topic),
# again with a log10 count axis
ggplot(lda_gamma, aes(x = gamma, fill = as.factor(topic))) +
  facet_wrap(~ topic, ncol = 4) +
  geom_histogram(show.legend = FALSE) +
  labs(
    title = "Distribution of probability for each topic",
    x = expression(gamma),
    y = "Number of documents"
  ) +
  scale_y_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 410 rows containing missing values (geom_bar).
Discover which categories are associated with which topic.
# 8.4.4
# Attach each document's category to its gamma rows; `document` in lda_gamma
# is matched against `id` in jobs_cat. Documents with no category get NA.
lda_gamma <- lda_gamma %>%
  full_join(jobs_cat, by = c(document = "id"))
print(lda_gamma)
## # A tibble: 1,600 x 4
## document topic gamma category
## <chr> <int> <dbl> <chr>
## 1 674c331993a36bb28fd4cf302ce… 1 2.79e-5 business and financial oper…
## 2 a7c13d21920e860ddaefbb3085e… 1 2.79e-5 business and financial oper…
## 3 625160c64fec3702a8c0c7ae97e… 1 4.96e-5 <NA>
## 4 c10862d80f4bcf1b0b72bfb7f86… 1 3.27e-5 <NA>
## 5 99045ef17e641c9631cc2d5939e… 1 4.22e-5 Engineering/Architecture
## 6 de26a41473d841eb4f4e6c89a9c… 1 2.89e-5 Banking/Loans
## 7 eda91b88eb3096ed98bc1a5f6b5… 1 2.26e-5 <NA>
## 8 40ca3fab8f31010d1b5f3587750… 1 4.48e-5 <NA>
## 9 cbf0b6e8a77c62543a82b26fb06… 1 3.27e-5 Manufacturing/Mechanical
## 10 972e897473d65f34b8e7f1c1b4c… 1 4.41e-5 <NA>
## # … with 1,590 more rows
# For documents assigned to a topic with probability above 0.5, count how
# many fall into each (topic, category) combination.
top_cats <- count(
  filter(lda_gamma, gamma > 0.5),
  topic, category,
  sort = TRUE
)
print(top_cats)
## # A tibble: 58 x 3
## topic category n
## <int> <chr> <int>
## 1 16 <NA> 7
## 2 15 <NA> 6
## 3 3 <NA> 5
## 4 1 <NA> 4
## 5 15 business and financial operations 4
## 6 6 <NA> 3
## 7 11 <NA> 3
## 8 14 <NA> 3
## 9 2 Engineering/Architecture 2
## 10 4 Accounting/Finance 2
## # … with 48 more rows
A little hard to decipher, but I think connecting this plot to the previous plot of the topics would then show which words (skills) are meaningful by category.
# One more graph from 8.4.4: the (up to) five most common categories within
# each LDA topic, one facet per topic.
top_cats %>%
  group_by(topic) %>%
  top_n(n = 5, wt = n) %>%
  ungroup() %>%
  mutate(category = reorder_within(category, by = n, within = topic)) %>%
  ggplot(aes(x = category, y = n, fill = as.factor(topic))) +
  facet_wrap(~ topic, ncol = 4, scales = "free") +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  coord_flip() +
  labs(
    title = "Top categories for each LDA topic",
    x = NULL,
    y = "Number of documents"
  )
Create the dataframe of the first 100 rows
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ tibble 2.1.3 ✔ purrr 0.3.3
## ✔ tidyr 1.0.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tibble::as_data_frame() masks igraph::as_data_frame(), dplyr::as_data_frame()
## ✖ purrr::compose() masks igraph::compose()
## ✖ tidyr::crossing() masks igraph::crossing()
## ✖ dplyr::filter() masks stats::filter()
## ✖ igraph::groups() masks dplyr::groups()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::simplify() masks igraph::simplify()
library(quanteda)
## Package version: 2.0.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:igraph':
##
## as.igraph
## The following object is masked from 'package:utils':
##
## View
# See below for how this works
# https://www.r-bloggers.com/advancing-text-mining-with-r-and-quanteda/
# Re-read the listings for the quanteda part of the analysis.
jobs_df <- read.csv(
  file = "data_scientist_united_states_job_postings_jobspikr.csv",
  stringsAsFactors = FALSE
)
# For testing purposes, keep only the first 100 rows. head() is used rather
# than jobs_df[1:100, ] because indexing past the last row pads the data frame
# with all-NA rows when the file holds fewer than 100 listings.
jobs_df <- head(jobs_df, 100)
# Only the document id and the description text are needed for the corpus.
jobs_df <- jobs_df[, c("uniq_id", "job_description")]
class(jobs_df)
## [1] "data.frame"
Create a corpus based on the unique ID and the job description
# Build a quanteda corpus: the description is the document text and the
# unique id is the document name.
my_corpus <- corpus(
  jobs_df,
  text_field = "job_description",
  docid_field = "uniq_id"
)
# my_corpus  # uncomment to inspect the corpus itself
# Per-document summary statistics; show the first few rows
mycorpus_stats <- my_corpus %>% summary()
head(mycorpus_stats)
## Text Types Tokens Sentences
## 1 3b6c6acfcba6135a31c83bd7ea493b18 263 476 11
## 2 741727428839ae7ada852eebef29b0fe 104 176 8
## 3 cdc9ef9a1de327ccdc19cc0d07dbbb37 65 83 1
## 4 1c8541cd2c2c924f9391c7d3f526f64e 312 592 14
## 5 445652a560a5441060857853cf267470 276 465 13
## 6 9571ec617ba209fd9a4f842973a4e9c8 331 686 23
Preprocess the text. Remove numbers, remove punctuation, remove symbols, remove URLs, split hyphens. Because the blog entry included it, I kept the part about cleaning for OCR.
# Preprocess the text: tokenize the corpus while dropping numbers,
# punctuation, symbols and URLs, and splitting hyphenated words.
token <- tokens(
  x = my_corpus,
  split_hyphens = TRUE,
  remove_url = TRUE,
  remove_symbols = TRUE,
  remove_punct = TRUE,
  remove_numbers = TRUE
)
# Probably not needed here, but kept from the blog post: remove tokens that
# match any of three regexes (a digit or hyphen, a punctuation character,
# or a total length of one or two characters).
token_ungd <- tokens_select(
  x = token,
  pattern = c("[\\d-]", "[[:punct:]]", "^.{1,2}$"),
  valuetype = "regex",
  selection = "remove",
  verbose = TRUE
)
## removed 299 features
Create a document-feature matrix (DFM), this time using quanteda.
Also, filter out words that appear in fewer than 7.5% or more than 90% of the documents. This rather conservative approach is possible because we have a sufficiently large corpus.
# Document-feature matrix: lower-cased, stemmed, English stop words removed
my_dfm <- dfm(
  token_ungd,
  remove = stopwords("english"),
  stem = TRUE,
  tolower = TRUE
)
# Keep features whose document frequency, as a proportion of documents,
# lies between 7.5% and 90%.
my_dfm_trim <- dfm_trim(
  my_dfm,
  docfreq_type = "prop",
  min_docfreq = 0.075,
  max_docfreq = 0.90
)
# Show the 10 x 10 corner of the trimmed matrix after sorting both margins
# in decreasing order
my_dfm_trim %>%
  dfm_sort(decreasing = TRUE, margin = "both") %>%
  head(n = 10, nf = 10)
## Document-feature matrix of: 10 documents, 10 features (5.0% sparse).
## features
## docs work model team learn develop busi use
## eda91b88eb3096ed98bc1a5f6b5568df 5 9 1 16 5 1 9
## ea67f8f42a6fa593ce7acefddd8aef2e 17 4 20 5 1 2 4
## 674c331993a36bb28fd4cf302ce66e9d 4 0 4 2 2 2 5
## a7c13d21920e860ddaefbb3085eece14 4 0 4 2 2 2 5
## 8d220fbda8e4ab36f10bba4ec8ff20ad 9 14 10 5 10 0 2
## de26a41473d841eb4f4e6c89a9c0c1fb 8 6 9 4 5 7 2
## features
## docs skill analyt statist
## eda91b88eb3096ed98bc1a5f6b5568df 3 6 9
## ea67f8f42a6fa593ce7acefddd8aef2e 6 2 0
## 674c331993a36bb28fd4cf302ce66e9d 6 2 1
## a7c13d21920e860ddaefbb3085eece14 6 2 1
## 8d220fbda8e4ab36f10bba4ec8ff20ad 4 1 2
## de26a41473d841eb4f4e6c89a9c0c1fb 6 4 3
## [ reached max_ndoc ... 4 more documents ]
# https://quanteda.io/articles/pkgdown/examples/plotting.html
#corpus_subset(my_corpus) %>%
# dfm(remove = stopwords("english"), remove_punct = TRUE) %>%
# dfm_trim(min_termfreq = 1, verbose = FALSE) %>%
# textplot_wordcloud(comparison = TRUE)
# Word cloud of the (untrimmed) matrix, with min_count set to 10
textplot_wordcloud(
  my_dfm,
  min_count = 10,
  color = c('red', 'pink', 'green', 'purple', 'orange', 'blue')
)
# X-ray plot of the keyword-in-context matches for "data"
textplot_xray(kwic(my_corpus, pattern = "data"))
library("ggplot2")
theme_set(theme_bw())
# X-ray plot of the keyword-in-context matches for "python" and for "r"
g <- textplot_xray(
  my_corpus %>% kwic(pattern = "python"),
  my_corpus %>% kwic(pattern = "r")
)
# Color by keyword (blue / red) and hide the legend
g +
  aes(color = keyword) +
  scale_color_manual(values = c("blue", "red")) +
  theme(legend.position = "none")
# Frequency plots
# Frequency table limited to 50 features (n = 50)
features_dfm <- textstat_frequency(my_dfm, n = 50)
# Sort by reverse frequency order
features_dfm$feature <- reorder(features_dfm$feature, -features_dfm$frequency)
# Point plot of frequency per feature, with the x labels rotated 90 degrees
features_dfm %>%
  ggplot(aes(x = feature, y = frequency)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
From: https://www.tidytextmining.com/
library(tidytext)
library(dplyr)
library(ggplot2)
library(tidyr)
# Re-read the job listings from the CSV file
jobs_df <- read.csv(
  file = 'data_scientist_united_states_job_postings_jobspikr.csv',
  stringsAsFactors = FALSE
)
# For just testing purposes, just the first 100 rows only, keeping only the
# description and the id columns
jobs_df <- jobs_df[1:100, c('job_description', 'uniq_id')]
# One row per (listing, word) with its count n; stop words are removed
jobs_words <- jobs_df %>%
  unnest_tokens(output = word, input = job_description) %>%
  anti_join(stop_words) %>%
  count(uniq_id, word, sort = TRUE)
## Joining, by = "word"
# Sum of the word counts (n) within each listing
total_words <- summarize(group_by(jobs_words, uniq_id), total = sum(n))
# Attach each listing's total to its word rows
jobs_words <- jobs_words %>%
  left_join(total_words)
## Joining, by = "uniq_id"
jobs_words
## # A tibble: 16,453 x 4
## uniq_id word n total
## <chr> <chr> <int> <int>
## 1 674c331993a36bb28fd4cf302ce66e9d data 41 505
## 2 a7c13d21920e860ddaefbb3085eece14 data 41 505
## 3 625160c64fec3702a8c0c7ae97e77825 data 27 282
## 4 c10862d80f4bcf1b0b72bfb7f861776f data 27 431
## 5 99045ef17e641c9631cc2d5939e3fa0c data 25 328
## 6 de26a41473d841eb4f4e6c89a9c0c1fb data 25 483
## 7 eda91b88eb3096ed98bc1a5f6b5568df data 25 613
## 8 40ca3fab8f31010d1b5f35877508c0bf data 24 312
## 9 cbf0b6e8a77c62543a82b26fb06b34da data 23 431
## 10 972e897473d65f34b8e7f1c1b4c74b1c data 22 317
## # … with 16,443 more rows
Don’t see much value in this one. Probably just remove.
# Histogram of n / total per listing, restricted to rows with n > 20
jobs_words %>%
  filter(n > 20) %>%
  ggplot(aes(n / total, fill = uniq_id)) +
  geom_histogram(show.legend = FALSE) +
  xlim(0.0, 0.4) +
  facet_wrap(~uniq_id, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing missing values (geom_bar).
Don’t see any value in this one. Probably just remove.
# Add the tf, idf and tf_idf columns, using word as the term and uniq_id as
# the document
jobs_words <- bind_tf_idf(jobs_words, term = word, document = uniq_id, n = n)
jobs_words
## # A tibble: 16,453 x 7
## uniq_id word n total tf idf tf_idf
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 674c331993a36bb28fd4cf302ce66e9d data 41 505 0.0812 0.0305 0.00247
## 2 a7c13d21920e860ddaefbb3085eece14 data 41 505 0.0812 0.0305 0.00247
## 3 625160c64fec3702a8c0c7ae97e77825 data 27 282 0.0957 0.0305 0.00292
## 4 c10862d80f4bcf1b0b72bfb7f861776f data 27 431 0.0626 0.0305 0.00191
## 5 99045ef17e641c9631cc2d5939e3fa0c data 25 328 0.0762 0.0305 0.00232
## 6 de26a41473d841eb4f4e6c89a9c0c1fb data 25 483 0.0518 0.0305 0.00158
## 7 eda91b88eb3096ed98bc1a5f6b5568df data 25 613 0.0408 0.0305 0.00124
## 8 40ca3fab8f31010d1b5f35877508c0bf data 24 312 0.0769 0.0305 0.00234
## 9 cbf0b6e8a77c62543a82b26fb06b34da data 23 431 0.0534 0.0305 0.00163
## 10 972e897473d65f34b8e7f1c1b4c74b1c data 22 317 0.0694 0.0305 0.00211
## # … with 16,443 more rows
# Rows ordered by highest tf-idf first, shown without the total column
jobs_words %>%
  arrange(desc(tf_idf)) %>%
  select(-total)
## # A tibble: 16,453 x 6
## uniq_id word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 0ec629c03f3e82651711f2626c23cadb licensing 14 0.0601 4.61 0.277
## 2 87c7330ffd4a65d432eee4ca5955b971 food 6 0.0561 3.51 0.197
## 3 96b331bc3cb323f9fc8c43a5029687a6 tekvalley.com 2 0.0426 4.61 0.196
## 4 87c7330ffd4a65d432eee4ca5955b971 heinz 4 0.0374 4.61 0.172
## 5 87c7330ffd4a65d432eee4ca5955b971 kraft 4 0.0374 4.61 0.172
## 6 37c4c80252fb381734614164b6bf2c9b fedex 12 0.0373 4.61 0.172
## 7 e682d47f650976c7cfd182e10656003e crypto 6 0.0357 4.61 0.164
## 8 77d0b5f4638619632040c228a561f031 person 4 0.0548 2.66 0.146
## 9 3b6c6acfcba6135a31c83bd7ea493b18 farmers 9 0.0363 3.91 0.142
## 10 bfb0fdec92e9d40b47a26cec6b1fe457 validation 2 0.0444 3.00 0.133
## # … with 16,443 more rows
Don’t see value in below graph. Remove
# Bar chart of the top 15 rows per listing (top_n() is given no weight
# column here), one facet per listing
jobs_words %>%
  arrange(desc(tf_idf)) %>%
  # Fix the factor levels so the bars follow the tf-idf ordering
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(uniq_id) %>%
  top_n(15) %>%
  ungroup() %>%
  ggplot(aes(x = word, y = tf_idf, fill = uniq_id)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~uniq_id, ncol = 2, scales = "free") +
  coord_flip() +
  labs(x = NULL, y = "tf-idf")
## Selecting by tf_idf
Use bigrams (pairs of consecutive words) to find the word pairs that occur most frequently.
# Tokenize each job description into 2-word n-grams
jobs_bigrams <- unnest_tokens(
  jobs_df,
  output = bigram,
  input = job_description,
  token = "ngrams",
  n = 2
)
# Most frequent bigrams, before any stop-word filtering
count(jobs_bigrams, bigram, sort = TRUE)
## # A tibble: 20,584 x 2
## bigram n
## <chr> <int>
## 1 machine learning 153
## 2 experience with 143
## 3 data scientist 126
## 4 ability to 116
## 5 of the 112
## 6 data science 109
## 7 experience in 105
## 8 in a 81
## 9 in the 78
## 10 years of 78
## # … with 20,574 more rows
# Split each bigram into its two words
bigrams_separated <- separate(
  jobs_bigrams,
  bigram,
  into = c("word1", "word2"),
  sep = " "
)
# Keep only the bigrams in which neither word is a stop word
bigrams_filtered <- filter(
  bigrams_separated,
  !word1 %in% stop_words$word,
  !word2 %in% stop_words$word
)
# new bigram counts:
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)
# This result is valuable
print(bigram_counts, n = 100)
## # A tibble: 8,637 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 machine learning 153
## 2 data scientist 126
## 3 data science 109
## 4 computer science 48
## 5 data sets 46
## 6 data analysis 42
## 7 data mining 37
## 8 communication skills 33
## 9 job description 26
## 10 data analytics 25
## 11 data scientists 25
## 12 equal opportunity 25
## 13 neural networks 22
## 14 clinical data 21
## 15 data review 20
## 16 learning algorithms 20
## 17 data sources 19
## 18 data visualization 18
## 19 related field 18
## 20 deep learning 17
## 21 learning techniques 17
## 22 real world 17
## 23 sexual orientation 17
## 24 advanced analytics 16
## 25 national origin 16
## 26 opportunity employer 16
## 27 statistical analysis 16
## 28 veteran status 16
## 29 artificial intelligence 15
## 30 gender identity 15
## 31 quantitative field 15
## 32 software development 15
## 33 team player 15
## 34 orientation gender 14
## 35 unstructured data 14
## 36 data management 13
## 37 functional teams 13
## 38 operations research 13
## 39 scikit learn 13
## 40 senior data 13
## 41 solving skills 13
## 42 statistical modeling 13
## 43 complex data 12
## 44 decision trees 12
## 45 ideal candidate 12
## 46 predictive modeling 12
## 47 programming languages 12
## 48 qualified applicants 12
## 49 random forest 12
## 50 statistical methods 12
## 51 time series 12
## 52 affirmative action 11
## 53 analytical skills 11
## 54 cross functional 11
## 55 learning models 11
## 56 action employer 10
## 57 analyze data 10
## 58 bachelor’s degree 10
## 59 business intelligence 10
## 60 color religion 10
## 61 cutting edge 10
## 62 minimum qualifications 10
## 63 mining techniques 10
## 64 predictive analytics 10
## 65 statistical models 10
## 66 technical skills 10
## 67 business partners 9
## 68 data environment 9
## 69 development experience 9
## 70 engineering team 9
## 71 experience experience 9
## 72 feature engineering 9
## 73 master's degree 9
## 74 model development 9
## 75 product development 9
## 76 project management 9
## 77 race color 9
## 78 relevant field 9
## 79 scientist location 9
## 80 wide range 9
## 81 cloud resiliency 8
## 82 data engineering 8
## 83 data gathering 8
## 84 data models 8
## 85 data processing 8
## 86 data sciences 8
## 87 excellent written 8
## 88 experience developing 8
## 89 fast paced 8
## 90 highly collaborative 8
## 91 interpersonal skills 8
## 92 job summary 8
## 93 manipulating data 8
## 94 mathematics computer 8
## 95 mathematics statistics 8
## 96 operational data 8
## 97 preferred qualifications 8
## 98 public cloud 8
## 99 python sql 8
## 100 science mathematics 8
## # … with 8,537 more rows
Trigrams (3 words occurring together) to see if there is anything meaningful from that. This appears to be identifying boilerplate text that occurs for many listings but isn’t specific to th
# Count 3-word n-grams, dropping any trigram that contains a stop word
jobs_df %>%
  unnest_tokens(trigram, job_description, token = "ngrams", n = 3) %>%
  separate(trigram, into = c("word1", "word2", "word3"), sep = " ") %>%
  filter(
    !(word1 %in% stop_words$word |
      word2 %in% stop_words$word |
      word3 %in% stop_words$word)
  ) %>%
  count(word1, word2, word3, sort = TRUE)
## # A tibble: 6,394 x 4
## word1 word2 word3 n
## <chr> <chr> <chr> <int>
## 1 machine learning algorithms 18
## 2 equal opportunity employer 16
## 3 machine learning techniques 16
## 4 sexual orientation gender 14
## 5 orientation gender identity 12
## 6 affirmative action employer 10
## 7 data mining techniques 10
## 8 senior data scientist 10
## 9 data scientist location 9
## 10 machine learning models 9
## # … with 6,384 more rows
Graph the bigrams
# Visualizing bigrams
library(igraph)
# filter for only relatively common combinations (count above 10) and build
# a graph from the resulting data frame
bigram_graph <- graph_from_data_frame(filter(bigram_counts, n > 10))
bigram_graph
## IGRAPH 285e1be DN-- 79 55 --
## + attr: name (v/c), n (e/n)
## + edges from 285e1be (vertex names):
## [1] machine ->learning data ->scientist
## [3] data ->science computer ->science
## [5] data ->sets data ->analysis
## [7] data ->mining communication->skills
## [9] job ->description data ->analytics
## [11] data ->scientists equal ->opportunity
## [13] neural ->networks clinical ->data
## [15] data ->review learning ->algorithms
## + ... omitted several edges
Good graph here. Keep
library(ggraph)
# Seed set before plotting; presumably so the "fr" layout is reproducible
set.seed(2020)
# Draw edges, then nodes, then node labels
bigram_graph %>%
  ggraph(layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
Good graph here. Keep
# Seed set before plotting; presumably so the "fr" layout is reproducible
set.seed(2021)
# Closed arrow head used on the edges
a <- grid::arrow(length = unit(.15, "inches"), type = "closed")
# Same graph with edge transparency mapped to the bigram count n
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n),
    arrow = a,
    end_cap = circle(.07, 'inches'),
    show.legend = FALSE
  ) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()
Just a generic plot of the most common words: those that appear in more than 25 of the job listings (each word is counted once per listing).
# Remove stop words
# NOTE(review): jobs_words was already built with anti_join(stop_words), so
# this looks redundant - confirm before removing
jobs_words <- anti_join(jobs_words, stop_words)
## Joining, by = "word"
# Number of jobs_words rows per word, most frequent first
count(jobs_words, word, sort = TRUE)
## # A tibble: 3,824 x 2
## word n
## <chr> <int>
## 1 data 97
## 2 experience 94
## 3 python 82
## 4 learning 74
## 5 skills 74
## 6 team 72
## 7 analysis 71
## 8 machine 70
## 9 scientist 70
## 10 science 69
## # … with 3,814 more rows
# Horizontal bar chart of the words whose row count is above 25
jobs_words %>%
  count(word, sort = TRUE) %>%
  filter(n > 25) %>%
  # Order the bars by count
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  xlab(NULL)
=== Just notes below
Education R programming python coding hadoop platform sql database/coding apache spark machine learning and AI data visualization unstructured data intellectual curiosity business acumen communication skills teamwork
communication business acumen data-drive problem solving data visualization programming R python tableau hadoop sql spark statistics mathematics